Regresión logística utilizando la base de datos del Titanic Librería para análisis gráfico de los valores presentes y perdidos

library(Amelia)   #'Permite la representación visual del dataset
## Warning: package 'Amelia' was built under R version 3.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.3.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(dplyr)    #'Extensión dedicada al análisis de datasets
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2) #'Extensión utilizada para el ajuste gráfico
## Warning: package 'reshape2' was built under R version 3.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
df.train<- read.csv('~/R/train.csv',header=T,na.strings=c(""))

Contar número de valores perdidos

sapply(df.train,function(x) sum(is.na(x)))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

Contar número de valores presentes

sapply(df.train, function(x) length(unique(x)))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##         891           2           3         891           2          89 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           7           7         681         248         148           4

Rrevisar las características del dataset

print(str(df.train))
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 147 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## NULL

Generar un mapa de color con los valores presentes y perdidos Función para aplicar ggplot a missmap

x<-df.train
x %>% 
  is.na %>%
  melt %>%
  ggplot(data = .,
         aes(x = Var2,
             y = Var1)) +
  geom_raster(aes(fill = value)) +
  scale_fill_grey(name = "",
                  labels = c("Presentes","Perdidos")) +
  theme_classic() + 
  theme(axis.text.x  = element_text(angle=45, vjust=0.5)) + 
  labs(x = "Variables en el Dataset",
       y = "Observaciones")

Con el fin de trabajar con entradas más consistentes, se eliminan a continuación las columnas que no aportan información, como el ID (columna 1), Nombre (Columna 4),El número del ticket (Columna 9) y Cabin (Columna 11 con más datos ausentes)

data <- subset(df.train, select = c(2,3,5,6,7,8,10,12))

Nuevamente se genera el mapa de valores perdidos. donde sólo quedan ausentes algunas edades

x<-data
x %>% 
  is.na %>%
  melt %>%
  ggplot(data = .,
         aes(x = Var2,
             y = Var1)) +
  geom_raster(aes(fill = value)) +
  scale_fill_grey(name = "",
                  labels = c("Presentes","Perdidos")) +
  theme_classic() + 
  theme(axis.text.x  = element_text(angle=45, vjust=0.5)) + 
  labs(x = "Variables en el Dataset",
       y = "Observaciones")

El dataset es tranformado mediante el relleno de los valores perdidos de la edad, para este caso, se trabajará con el promedio lo cual se verá reflejado en el histograma de frecuencias

data$Age[is.na(data$Age)] <- round(mean(data$Age,na.rm=T), digits = 1)

Al observar el comportamiento de los datos, se puede apreciar que el promedio tiene un comportamiento dispar a los demás, ya que el mayor recuento de las otras categorías se encuentra en 27, frente a 202; por tanto, se procede a eliminar las respuestas ausentes.

bp_a <- ggplot(data,aes(x=Age, fill = factor(Age)))+geom_histogram(bins= 60, alpha =0.5, show.legend = NA) 
ggplotly()

Eliminación de los registros ausentes para la variable Edad

data <- subset(df.train, select = c(2,3,5,6,7,8,10,12))
data <- data[!is.na(data$Age),]

Eliminación de los registros ausentes para la variable categórica Embarked

data <- data[!is.na(data$Embarked),]
rownames(data) <- NULL

las variables codificadas por R (categóricas), se pueden estudiar mediante la función de contraste

contrasts(data$Sex)
##        male
## female    0
## male      1
contrasts(data$Embarked)
##   Q S
## C 0 0
## Q 1 0
## S 0 1

El comportamiento de las variables predictoras puede ser analizado mediante diagramas de frecuencia, conteo o histogramas Sobrevivió

bp_s <- ggplot(data,aes(x=Survived, fill = factor(Survived))) + geom_bar()
ggplotly()

Clase del tiquete

bp_pc <- ggplot(data,aes(x=Pclass, fill = factor(Pclass)))+geom_bar()
ggplotly()

Género

bp_s <- ggplot(data,aes(x=Sex, fill = factor(Sex)))+geom_bar()
ggplotly()

Edad

bp_a <- ggplot(data,aes(x=Age, fill = factor(Age)))+geom_histogram(bins= 60, alpha =0.5, show.legend = NA) 
ggplotly()

SibSp

bp_si <- ggplot(data,aes(x=SibSp, fill = factor(SibSp)))+geom_bar()
ggplotly()

Parch

bp_pa <- ggplot(data,aes(x=Parch, fill = factor(Parch)))+geom_bar()
ggplotly()

Fare

bp_f <- ggplot(data,aes(x=Fare, fill = factor(Fare)))+geom_histogram(bins= 30, alpha =0.5, show.legend = NA) 
ggplotly()

Construcción del modelo de regresión logístico binario

model <- glm(Survived ~.,family=binomial(link='logit'),data=data)

Salidas del modelo

summary(model)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"), 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7233  -0.6447  -0.3799   0.6326   2.4457  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.637407   0.634550   8.884  < 2e-16 ***
## Pclass      -1.199251   0.164619  -7.285 3.22e-13 ***
## Sexmale     -2.638476   0.222256 -11.871  < 2e-16 ***
## Age         -0.043350   0.008232  -5.266 1.39e-07 ***
## SibSp       -0.363208   0.129017  -2.815  0.00487 ** 
## Parch       -0.060270   0.123900  -0.486  0.62666    
## Fare         0.001432   0.002531   0.566  0.57165    
## EmbarkedQ   -0.823545   0.600229  -1.372  0.17005    
## EmbarkedS   -0.401213   0.270283  -1.484  0.13770    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 960.90  on 711  degrees of freedom
## Residual deviance: 632.34  on 703  degrees of freedom
## AIC: 650.34
## 
## Number of Fisher Scoring iterations: 5

Teniendo en cuenta lo anterior, se identifican ciertas variables que inciden en la probabilidad de supervivencia al evento. Para el caso puntual de la edad (variable contínua), por cada año de más, la probabilidad de supervivencia disminuye en 0.043 (aproximadamente) De manera similar, al aumentar el tipo de clase (Pclass) indica una menor calidad categoría y por ende, disminuye la probabilidad de supervivencia en 1.20 aproximadamente.

newdata1 <- with(data, data.frame(Pclass=as.integer(factor(rep(1:3, each = 2))), Sex=factor(rep(c("female","male"), each = 6)), Age = mean(Age), SibSp = mean(SibSp),Parch=as.integer( factor(rep(0:2, each = 2))), Fare = mean(Fare),Embarked=factor(rep(c("C","Q","S"), each = 2))))
newdata1$SurvivedP <- predict(model, newdata = newdata1, type = "response")
newdata2 <- with(data, 
                 data.frame(Pclass=as.integer(factor(rep(1:3, each = 20))), 
                      Sex=factor(rep(c("female","male"), each = 60)), 
                      Age = rep(seq(from =min(Age), to= max(Age), length.out = 120)), 
                      SibSp = as.integer( factor(rep(0:3, each = 10))),
                      Parch=as.integer( factor(rep(0:2, each = 20))), 
                      Fare = rep(seq(from =min(Fare), to= max(Fare), length.out = 12)), 
                      Embarked=factor(rep(c("C","Q","S"), each = 20))))
newdata2$SurvivedP <- predict(model, newdata = newdata2, type = "response")
newdata2
##     Pclass    Sex       Age SibSp Parch      Fare Embarked   SurvivedP
## 1        1 female  0.420000     1     1   0.00000        C 0.981951345
## 2        1 female  1.088739     1     1  46.57538        C 0.982607273
## 3        1 female  1.757479     1     1  93.15076        C 0.983239771
## 4        1 female  2.426218     1     1 139.72615        C 0.983849646
## 5        1 female  3.094958     1     1 186.30153        C 0.984437679
## 6        1 female  3.763697     1     1 232.87691        C 0.985004629
## 7        1 female  4.432437     1     1 279.45229        C 0.985551227
## 8        1 female  5.101176     1     1 326.02767        C 0.986078183
## 9        1 female  5.769916     1     1 372.60305        C 0.986586182
## 10       1 female  6.438655     1     1 419.17844        C 0.987075887
## 11       1 female  7.107395     2     1 465.75382        C 0.982191854
## 12       1 female  7.776134     2     1 512.32920        C 0.982839194
## 13       1 female  8.444874     2     1   0.00000        C 0.963923984
## 14       1 female  9.113613     2     1  46.57538        C 0.965211858
## 15       1 female  9.782353     2     1  93.15076        C 0.966455355
## 16       1 female 10.451092     2     1 139.72615        C 0.967655893
## 17       1 female 11.119832     2     1 186.30153        C 0.968814852
## 18       1 female 11.788571     2     1 232.87691        C 0.969933572
## 19       1 female 12.457311     2     1 279.45229        C 0.971013361
## 20       1 female 13.126050     2     1 326.02767        C 0.972055489
## 21       2 female 13.794790     3     2 372.60305        Q 0.757790728
## 22       2 female 14.463529     3     2 419.17844        Q 0.764640561
## 23       2 female 15.132269     3     2 465.75382        Q 0.771355125
## 24       2 female 15.801008     3     2 512.32920        Q 0.777933763
## 25       2 female 16.469748     3     2   0.00000        Q 0.620396706
## 26       2 female 17.138487     3     2  46.57538        Q 0.629231002
## 27       2 female 17.807227     3     2  93.15076        Q 0.637979674
## 28       2 female 18.475966     3     2 139.72615        Q 0.646637840
## 29       2 female 19.144706     3     2 186.30153        Q 0.655200849
## 30       2 female 19.813445     3     2 232.87691        Q 0.663664281
## 31       2 female 20.482185     4     2 279.45229        Q 0.587622385
## 32       2 female 21.150924     4     2 326.02767        Q 0.596723617
## 33       2 female 21.819664     4     2 372.60305        Q 0.605758746
## 34       2 female 22.488403     4     2 419.17844        Q 0.614722146
## 35       2 female 23.157143     4     2 465.75382        Q 0.623608382
## 36       2 female 23.825882     4     2 512.32920        Q 0.632412221
## 37       2 female 24.494622     4     2   0.00000        Q 0.445256823
## 38       2 female 25.163361     4     2  46.57538        Q 0.454583731
## 39       2 female 25.832101     4     2  93.15076        Q 0.463942619
## 40       2 female 26.500840     4     2 139.72615        Q 0.473326974
## 41       3 female 27.169580     1     3 186.30153        S 0.545704347
## 42       3 female 27.838319     1     3 232.87691        S 0.555030167
## 43       3 female 28.507059     1     3 279.45229        S 0.564317390
## 44       3 female 29.175798     1     3 326.02767        S 0.573559704
## 45       3 female 29.844538     1     3 372.60305        S 0.582750923
## 46       3 female 30.513277     1     3 419.17844        S 0.591884998
## 47       3 female 31.182017     1     3 465.75382        S 0.600956039
## 48       3 female 31.850756     1     3 512.32920        S 0.609958324
## 49       3 female 32.519496     1     3   0.00000        S 0.421822510
## 50       3 female 33.188235     1     3  46.57538        S 0.431039956
## 51       3 female 33.856975     2     3  93.15076        S 0.353627710
## 52       3 female 34.525714     2     3 139.72615        S 0.362288738
## 53       3 female 35.194454     2     3 186.30153        S 0.371040124
## 54       3 female 35.863193     2     3 232.87691        S 0.379876980
## 55       3 female 36.531933     2     3 279.45229        S 0.388794200
## 56       3 female 37.200672     2     3 326.02767        S 0.397786470
## 57       3 female 37.869412     2     3 372.60305        S 0.406848279
## 58       3 female 38.538151     2     3 419.17844        S 0.415973928
## 59       3 female 39.206891     2     3 465.75382        S 0.425157550
## 60       3 female 39.875630     2     3 512.32920        S 0.434393119
## 61       1   male 40.544370     3     1   0.00000        C 0.248278905
## 62       1   male 41.213109     3     1  46.57538        C 0.255379162
## 63       1   male 41.881849     3     1  93.15076        C 0.262611536
## 64       1   male 42.550588     3     1 139.72615        C 0.269974470
## 65       1   male 43.219328     3     1 186.30153        C 0.277466162
## 66       1   male 43.888067     3     1 232.87691        C 0.285084562
## 67       1   male 44.556807     3     1 279.45229        C 0.292827365
## 68       1   male 45.225546     3     1 326.02767        C 0.300692012
## 69       1   male 45.894286     3     1 372.60305        C 0.308675685
## 70       1   male 46.563025     3     1 419.17844        C 0.316775312
## 71       1   male 47.231765     4     1 465.75382        C 0.250837129
## 72       1   male 47.900504     4     1 512.32920        C 0.257985437
## 73       1   male 48.569244     4     1   0.00000        C 0.139566033
## 74       1   male 49.237983     4     1  46.57538        C 0.144153518
## 75       1   male 49.906723     4     1  93.15076        C 0.148865703
## 76       1   male 50.575462     4     1 139.72615        C 0.153704260
## 77       1   male 51.244202     4     1 186.30153        C 0.158670766
## 78       1   male 51.912941     4     1 232.87691        C 0.163766696
## 79       1   male 52.581681     4     1 279.45229        C 0.168993415
## 80       1   male 53.250420     4     1 326.02767        C 0.174352167
## 81       2   male 53.919160     1     2 372.60305        Q 0.075101422
## 82       2   male 54.587899     1     2 419.17844        Q 0.077761477
## 83       2   male 55.256639     1     2 465.75382        Q 0.080507547
## 84       2   male 55.925378     1     2 512.32920        Q 0.083341829
## 85       2   male 56.594118     1     2   0.00000        Q 0.040690568
## 86       2   male 57.262857     1     2  46.57538        Q 0.042187401
## 87       2   male 57.931597     1     2  93.15076        Q 0.043736786
## 88       2   male 58.600336     1     2 139.72615        Q 0.045340380
## 89       2   male 59.269076     1     2 186.30153        Q 0.046999880
## 90       2   male 59.937815     1     2 232.87691        Q 0.048717019
## 91       2   male 60.606555     2     2 279.45229        Q 0.035663771
## 92       2   male 61.275294     2     2 326.02767        Q 0.036982818
## 93       2   male 61.944034     2     2 372.60305        Q 0.038348711
## 94       2   male 62.612773     2     2 419.17844        Q 0.039762968
## 95       2   male 63.281513     2     2 465.75382        Q 0.041227145
## 96       2   male 63.950252     2     2 512.32920        Q 0.042742837
## 97       2   male 64.618992     2     2   0.00000        Q 0.020406109
## 98       2   male 65.287731     2     2  46.57538        Q 0.021173233
## 99       2   male 65.956471     2     2  93.15076        Q 0.021968547
## 100      2   male 66.625210     2     2 139.72615        Q 0.022793040
## 101      3   male 67.293950     3     3 186.30153        S 0.007239379
## 102      3   male 67.962689     3     3 232.87691        S 0.007515326
## 103      3   male 68.631429     3     3 279.45229        S 0.007801707
## 104      3   male 69.300168     3     3 326.02767        S 0.008098913
## 105      3   male 69.968908     3     3 372.60305        S 0.008407345
## 106      3   male 70.637647     3     3 419.17844        S 0.008727419
## 107      3   male 71.306387     3     3 465.75382        S 0.009059568
## 108      3   male 71.975126     3     3 512.32920        S 0.009404237
## 109      3   male 72.643866     3     3   0.00000        S 0.004409479
## 110      3   male 73.312605     3     3  46.57538        S 0.004578054
## 111      3   male 73.981345     4     3  93.15076        S 0.003310255
## 112      3   male 74.650084     4     3 139.72615        S 0.003436952
## 113      3   male 75.318824     4     3 186.30153        S 0.003568481
## 114      3   male 75.987563     4     3 232.87691        S 0.003705024
## 115      3   male 76.656303     4     3 279.45229        S 0.003846772
## 116      3   male 77.325042     4     3 326.02767        S 0.003993921
## 117      3   male 77.993782     4     3 372.60305        S 0.004146675
## 118      3   male 78.662521     4     3 419.17844        S 0.004305247
## 119      3   male 79.331261     4     3 465.75382        S 0.004469855
## 120      3   male 80.000000     4     3 512.32920        S 0.004640728
newdata3 <- cbind(newdata2, predict(model, newdata = newdata2, type = "link", se = TRUE))
newdata3 <- within(newdata3, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))
})
ggplot(newdata3, aes(x = Age, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,ymax = UL, fill = Sex), alpha = 0.2) + geom_line(aes(colour = Sex), size = 1)

ggplotly()
ggplot(newdata3, aes(x = Age, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,ymax = UL, fill = Sex), alpha = 0.2) + geom_line(aes(colour = Sex), size = 1)

ggplotly()
ggplot(newdata3, aes(x = Age, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,ymax = UL, fill = Embarked), alpha = 0.2) + geom_line(aes(colour = Embarked), size = 1)

ggplotly()
ggplot(newdata3, aes(x = Parch, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,ymax = UL, fill = Embarked), alpha = 0.2) + geom_line(aes(colour = Embarked), size = 1)

ggplotly()